BibTeX
@inproceedings{wennberg-henter-2021-case,
title = "The Case for Translation-Invariant Self-Attention in Transformer-Based Language Models",
author = "Wennberg, Ulme and
Henter, Gustav Eje",
editor = "Zong, Chengqing and
Xia, Fei and
Li, Wenjie and
Navigli, Roberto",
booktitle = "Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)",
month = aug,
year = "2021",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2021.acl-short.18",
doi = "10.18653/v1/2021.acl-short.18",
pages = "130--140",
abstract = "Mechanisms for encoding positional information are central for transformer-based language models. In this paper, we analyze the position embeddings of existing language models, finding strong evidence of translation invariance, both for the embeddings themselves and for their effect on self-attention. The degree of translation invariance increases during training and correlates positively with model performance. Our findings lead us to propose translation-invariant self-attention (TISA), which accounts for the relative position between tokens in an interpretable fashion without needing conventional position embeddings. Our proposal has several theoretical advantages over existing position-representation approaches. Proof-of-concept experiments show that it improves on regular ALBERT on GLUE tasks, while only adding orders of magnitude less positional parameters.",
}
MODS XML
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="wennberg-henter-2021-case">
<titleInfo>
<title>The Case for Translation-Invariant Self-Attention in Transformer-Based Language Models</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ulme</namePart>
<namePart type="family">Wennberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Gustav</namePart>
<namePart type="given">Eje</namePart>
<namePart type="family">Henter</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-08</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)</title>
</titleInfo>
<name type="personal">
<namePart type="given">Chengqing</namePart>
<namePart type="family">Zong</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Fei</namePart>
<namePart type="family">Xia</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Wenjie</namePart>
<namePart type="family">Li</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Roberto</namePart>
<namePart type="family">Navigli</namePart>
<role>
<roleTerm authority="marcrelator" type="text">editor</roleTerm>
</role>
</name>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Mechanisms for encoding positional information are central for transformer-based language models. In this paper, we analyze the position embeddings of existing language models, finding strong evidence of translation invariance, both for the embeddings themselves and for their effect on self-attention. The degree of translation invariance increases during training and correlates positively with model performance. Our findings lead us to propose translation-invariant self-attention (TISA), which accounts for the relative position between tokens in an interpretable fashion without needing conventional position embeddings. Our proposal has several theoretical advantages over existing position-representation approaches. Proof-of-concept experiments show that it improves on regular ALBERT on GLUE tasks, while only adding orders of magnitude fewer positional parameters.</abstract>
<identifier type="citekey">wennberg-henter-2021-case</identifier>
<identifier type="doi">10.18653/v1/2021.acl-short.18</identifier>
<location>
<url>https://aclanthology.org/2021.acl-short.18</url>
</location>
<part>
<date>2021-08</date>
<extent unit="page">
<start>130</start>
<end>140</end>
</extent>
</part>
</mods>
</modsCollection>
Endnote
%0 Conference Proceedings
%T The Case for Translation-Invariant Self-Attention in Transformer-Based Language Models
%A Wennberg, Ulme
%A Henter, Gustav Eje
%Y Zong, Chengqing
%Y Xia, Fei
%Y Li, Wenjie
%Y Navigli, Roberto
%S Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers)
%D 2021
%8 August
%I Association for Computational Linguistics
%C Online
%F wennberg-henter-2021-case
%X Mechanisms for encoding positional information are central for transformer-based language models. In this paper, we analyze the position embeddings of existing language models, finding strong evidence of translation invariance, both for the embeddings themselves and for their effect on self-attention. The degree of translation invariance increases during training and correlates positively with model performance. Our findings lead us to propose translation-invariant self-attention (TISA), which accounts for the relative position between tokens in an interpretable fashion without needing conventional position embeddings. Our proposal has several theoretical advantages over existing position-representation approaches. Proof-of-concept experiments show that it improves on regular ALBERT on GLUE tasks, while only adding orders of magnitude fewer positional parameters.
%R 10.18653/v1/2021.acl-short.18
%U https://aclanthology.org/2021.acl-short.18
%U https://doi.org/10.18653/v1/2021.acl-short.18
%P 130-140
Markdown (Informal)
[The Case for Translation-Invariant Self-Attention in Transformer-Based Language Models](https://aclanthology.org/2021.acl-short.18) (Wennberg & Henter, ACL-IJCNLP 2021)
ACL
Ulme Wennberg and Gustav Eje Henter. 2021. The Case for Translation-Invariant Self-Attention in Transformer-Based Language Models. In Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing (Volume 2: Short Papers), pages 130–140, Online. Association for Computational Linguistics.