@inproceedings{yamamoto-matsuzaki-2023-absolute,
    title = "Absolute Position Embedding Learns Sinusoid-like Waves for Attention Based on Relative Position",
    author = "Yamamoto, Yuji and
      Matsuzaki, Takuya",
    editor = "Bouamor, Houda and
      Pino, Juan and
      Bali, Kalika",
    booktitle = "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing",
    month = dec,
    year = "2023",
    address = "Singapore",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.emnlp-main.2",
    doi = "10.18653/v1/2023.emnlp-main.2",
    pages = "15--28",
    abstract = "Attention weights are a clue to interpreting how a Transformer-based model makes an inference. In some attention heads, the attention focuses on the neighbors of each token. This allows the output vector of each token to depend on the surrounding tokens and contributes to making the inference context-dependent. We analyze the mechanism behind this concentration of attention on nearby tokens. We show that the phenomenon emerges as follows: (1) the learned position embedding has sinusoid-like components, (2) such components are transmitted to the query and the key in self-attention, (3) the attention head shifts the phases of the sinusoid-like components so that the attention concentrates on nearby tokens at specific relative positions. In other words, a certain type of Transformer-based model acquires sinusoidal positional encoding to some extent on its own through Masked Language Modeling.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="yamamoto-matsuzaki-2023-absolute">
  <titleInfo>
    <title>Absolute Position Embedding Learns Sinusoid-like Waves for Attention Based on Relative Position</title>
  </titleInfo>
  <name type="personal">
    <namePart type="given">Yuji</namePart>
    <namePart type="family">Yamamoto</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <name type="personal">
    <namePart type="given">Takuya</namePart>
    <namePart type="family">Matsuzaki</namePart>
    <role>
      <roleTerm authority="marcrelator" type="text">author</roleTerm>
    </role>
  </name>
  <originInfo>
    <dateIssued>2023-12</dateIssued>
  </originInfo>
  <typeOfResource>text</typeOfResource>
  <relatedItem type="host">
    <titleInfo>
      <title>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Houda</namePart>
      <namePart type="family">Bouamor</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Juan</namePart>
      <namePart type="family">Pino</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Kalika</namePart>
      <namePart type="family">Bali</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">editor</roleTerm>
      </role>
    </name>
    <originInfo>
      <publisher>Association for Computational Linguistics</publisher>
      <place>
        <placeTerm type="text">Singapore</placeTerm>
      </place>
    </originInfo>
    <genre authority="marcgt">conference publication</genre>
  </relatedItem>
  <abstract>Attention weights are a clue to interpreting how a Transformer-based model makes an inference. In some attention heads, the attention focuses on the neighbors of each token. This allows the output vector of each token to depend on the surrounding tokens and contributes to making the inference context-dependent. We analyze the mechanism behind this concentration of attention on nearby tokens. We show that the phenomenon emerges as follows: (1) the learned position embedding has sinusoid-like components, (2) such components are transmitted to the query and the key in self-attention, (3) the attention head shifts the phases of the sinusoid-like components so that the attention concentrates on nearby tokens at specific relative positions. In other words, a certain type of Transformer-based model acquires sinusoidal positional encoding to some extent on its own through Masked Language Modeling.</abstract>
<identifier type="citekey">yamamoto-matsuzaki-2023-absolute</identifier>
<identifier type="doi">10.18653/v1/2023.emnlp-main.2</identifier>
<location>
<url>https://aclanthology.org/2023.emnlp-main.2</url>
</location>
<part>
<date>2023-12</date>
<extent unit="page">
<start>15</start>
<end>28</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Absolute Position Embedding Learns Sinusoid-like Waves for Attention Based on Relative Position
%A Yamamoto, Yuji
%A Matsuzaki, Takuya
%Y Bouamor, Houda
%Y Pino, Juan
%Y Bali, Kalika
%S Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing
%D 2023
%8 December
%I Association for Computational Linguistics
%C Singapore
%F yamamoto-matsuzaki-2023-absolute
%X Attention weights are a clue to interpreting how a Transformer-based model makes an inference. In some attention heads, the attention focuses on the neighbors of each token. This allows the output vector of each token to depend on the surrounding tokens and contributes to making the inference context-dependent. We analyze the mechanism behind this concentration of attention on nearby tokens. We show that the phenomenon emerges as follows: (1) the learned position embedding has sinusoid-like components, (2) such components are transmitted to the query and the key in self-attention, (3) the attention head shifts the phases of the sinusoid-like components so that the attention concentrates on nearby tokens at specific relative positions. In other words, a certain type of Transformer-based model acquires sinusoidal positional encoding to some extent on its own through Masked Language Modeling.
%R 10.18653/v1/2023.emnlp-main.2
%U https://aclanthology.org/2023.emnlp-main.2
%U https://doi.org/10.18653/v1/2023.emnlp-main.2
%P 15-28
Markdown (Informal)
[Absolute Position Embedding Learns Sinusoid-like Waves for Attention Based on Relative Position](https://aclanthology.org/2023.emnlp-main.2) (Yamamoto & Matsuzaki, EMNLP 2023)
ACL
Yuji Yamamoto and Takuya Matsuzaki. 2023. Absolute Position Embedding Learns Sinusoid-like Waves for Attention Based on Relative Position. In Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing, pages 15–28, Singapore. Association for Computational Linguistics.
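
Note: below is a minimal NumPy sketch of the mechanism the abstract describes: sinusoid-like position components in the queries and keys, with the key phase shifted so that pre-softmax attention scores depend only on relative position. The fixed Vaswani-style sinusoids and the hand-picked phase offset `SHIFT` are illustrative assumptions; in the paper the waves are learned through Masked Language Modeling and the phase shift is realized by the trained query/key projections.

```python
import numpy as np

N_POS, D_MODEL, SHIFT = 32, 64, 2.0  # SHIFT: hypothetical phase offset (radians)

pos = np.arange(N_POS, dtype=float)[:, None]                  # (N_POS, 1)
freq = 10000.0 ** (-2.0 * np.arange(D_MODEL // 2) / D_MODEL)  # one rate per sin/cos pair
angle = pos * freq[None, :]                                   # (N_POS, D_MODEL // 2)

def waves(theta):
    """Interleave sin/cos columns into a (N_POS, D_MODEL) embedding matrix."""
    out = np.empty((theta.shape[0], 2 * theta.shape[1]))
    out[:, 0::2] = np.sin(theta)
    out[:, 1::2] = np.cos(theta)
    return out

Q = waves(angle)          # queries carry the sinusoid-like position components
K = waves(angle + SHIFT)  # keys carry the same waves, phase-shifted by SHIFT

scores = Q @ K.T  # pre-softmax attention scores for every (query, key) pair

# Per sin/cos pair, sin(a)sin(b) + cos(a)cos(b) = cos(a - b), so
# scores[i, j] = sum_f cos((i - j) * freq[f] - SHIFT): a function of i - j alone.
print(np.allclose(np.diag(scores, 3), scores[0, 3]))    # True: relative position only
print(int(np.argmax(scores[N_POS // 2])) - N_POS // 2)  # the head's preferred offset
```

The first print confirms that every diagonal of the score matrix is constant, so the softmax over each row places its mass at a fixed relative offset; this is the concentration of attention on nearby tokens that the paper analyzes.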